In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
#import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
In [1]:
jupyter nbconvert --to html "churn_Modelling assignment.ipynb"
Cell In[1], line 1 jupyter nbconvert --to html "churn_Modelling assignment.ipynb" ^ SyntaxError: invalid syntax
In [9]:
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
In [10]:
#load dataset
df = pd.read_csv('promotion_dataset.csv')
df.head()
Out[10]:
| EmployeeNo | Division | Qualification | Gender | Channel_of_Recruitment | Trainings_Attended | Year_of_birth | Last_performance_score | Year_of_recruitment | Targets_met | Previous_Award | Training_score_average | State_Of_Origin | Foreign_schooled | Marital_Status | Past_Disciplinary_Action | Previous_IntraDepartmental_Movement | No_of_previous_employers | Promoted_or_Not | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | YAK/S/00001 | Commercial Sales and Marketing | MSc MBA and PhD | Female | Direct Internal process | 2 | 1986 | 12.5 | 2011 | 1 | 0 | 41 | ANAMBRA | No | Married | No | No | 0 | 0 |
| 1 | YAK/S/00002 | Customer Support and Field Operations | First Degree or HND | Male | Agency and others | 2 | 1991 | 12.5 | 2015 | 0 | 0 | 52 | ANAMBRA | Yes | Married | No | No | 0 | 0 |
| 2 | YAK/S/00003 | Commercial Sales and Marketing | First Degree or HND | Male | Direct Internal process | 2 | 1987 | 7.5 | 2012 | 0 | 0 | 42 | KATSINA | Yes | Married | No | No | 0 | 0 |
| 3 | YAK/S/00004 | Commercial Sales and Marketing | First Degree or HND | Male | Agency and others | 3 | 1982 | 2.5 | 2009 | 0 | 0 | 42 | NIGER | Yes | Single | No | No | 1 | 0 |
| 4 | YAK/S/00006 | Information and Strategy | First Degree or HND | Male | Direct Internal process | 3 | 1990 | 7.5 | 2012 | 0 | 0 | 77 | AKWA IBOM | Yes | Married | No | No | 1 | 0 |
In [11]:
# getting information about dataset
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 38312 entries, 0 to 38311 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 EmployeeNo 38312 non-null object 1 Division 38312 non-null object 2 Qualification 36633 non-null object 3 Gender 38312 non-null object 4 Channel_of_Recruitment 38312 non-null object 5 Trainings_Attended 38312 non-null int64 6 Year_of_birth 38312 non-null int64 7 Last_performance_score 38312 non-null float64 8 Year_of_recruitment 38312 non-null int64 9 Targets_met 38312 non-null int64 10 Previous_Award 38312 non-null int64 11 Training_score_average 38312 non-null int64 12 State_Of_Origin 38312 non-null object 13 Foreign_schooled 38312 non-null object 14 Marital_Status 38312 non-null object 15 Past_Disciplinary_Action 38312 non-null object 16 Previous_IntraDepartmental_Movement 38312 non-null object 17 No_of_previous_employers 38312 non-null int64 18 Promoted_or_Not 38312 non-null int64 dtypes: float64(1), int64(8), object(10) memory usage: 5.6+ MB
In [12]:
# more info on rows and collumns
df.shape
Out[12]:
(38312, 19)
In [13]:
#information about missing data
round((df.isnull().sum() / df.shape[0]) * 100, 2)
Out[13]:
EmployeeNo 0.00 Division 0.00 Qualification 4.38 Gender 0.00 Channel_of_Recruitment 0.00 Trainings_Attended 0.00 Year_of_birth 0.00 Last_performance_score 0.00 Year_of_recruitment 0.00 Targets_met 0.00 Previous_Award 0.00 Training_score_average 0.00 State_Of_Origin 0.00 Foreign_schooled 0.00 Marital_Status 0.00 Past_Disciplinary_Action 0.00 Previous_IntraDepartmental_Movement 0.00 No_of_previous_employers 0.00 Promoted_or_Not 0.00 dtype: float64
In [14]:
# handling missing data by filling with mode
df['Qualification'] = df['Qualification'].fillna(df['Qualification'].mode()[0])
In [15]:
#crosscheck missing data filled
round((df.isnull().sum() / df.shape[0]) * 100, 2)
Out[15]:
EmployeeNo 0.0 Division 0.0 Qualification 0.0 Gender 0.0 Channel_of_Recruitment 0.0 Trainings_Attended 0.0 Year_of_birth 0.0 Last_performance_score 0.0 Year_of_recruitment 0.0 Targets_met 0.0 Previous_Award 0.0 Training_score_average 0.0 State_Of_Origin 0.0 Foreign_schooled 0.0 Marital_Status 0.0 Past_Disciplinary_Action 0.0 Previous_IntraDepartmental_Movement 0.0 No_of_previous_employers 0.0 Promoted_or_Not 0.0 dtype: float64
In [16]:
# identify collumn data types
cat_col = [col for col in df.columns if df[col].dtype == 'object']
num_col = [col for col in df.columns if df[col].dtype != 'object']
print('Categorical columns:', cat_col)
print('Numerical columns:', num_col)
Categorical columns: ['EmployeeNo', 'Division', 'Qualification', 'Gender', 'Channel_of_Recruitment', 'State_Of_Origin', 'Foreign_schooled', 'Marital_Status', 'Past_Disciplinary_Action', 'Previous_IntraDepartmental_Movement'] Numerical columns: ['Trainings_Attended', 'Year_of_birth', 'Last_performance_score', 'Year_of_recruitment', 'Targets_met', 'Previous_Award', 'Training_score_average', 'No_of_previous_employers', 'Promoted_or_Not']
In [17]:
#count unique data for categorical data types
df[cat_col].nunique()
Out[17]:
EmployeeNo 38312 Division 9 Qualification 3 Gender 2 Channel_of_Recruitment 3 State_Of_Origin 37 Foreign_schooled 2 Marital_Status 3 Past_Disciplinary_Action 2 Previous_IntraDepartmental_Movement 2 dtype: int64
In [18]:
#statistical summary of dataset
df.describe()
Out[18]:
| Trainings_Attended | Year_of_birth | Last_performance_score | Year_of_recruitment | Targets_met | Previous_Award | Training_score_average | No_of_previous_employers | Promoted_or_Not | |
|---|---|---|---|---|---|---|---|---|---|
| count | 38312.000000 | 38312.000000 | 38312.000000 | 38312.000000 | 38312.000000 | 38312.000000 | 38312.000000 | 38312.000000 | 38312.000000 |
| mean | 2.253680 | 1986.209334 | 7.698959 | 2013.139695 | 0.352996 | 0.023152 | 55.366465 | 1.040953 | 0.084595 |
| std | 0.609443 | 7.646047 | 3.744135 | 4.261451 | 0.477908 | 0.150388 | 13.362741 | 1.235738 | 0.278282 |
| min | 2.000000 | 1950.000000 | 0.000000 | 1982.000000 | 0.000000 | 0.000000 | 31.000000 | 0.000000 | 0.000000 |
| 25% | 2.000000 | 1982.000000 | 5.000000 | 2012.000000 | 0.000000 | 0.000000 | 43.000000 | 0.000000 | 0.000000 |
| 50% | 2.000000 | 1988.000000 | 7.500000 | 2014.000000 | 0.000000 | 0.000000 | 52.000000 | 1.000000 | 0.000000 |
| 75% | 2.000000 | 1992.000000 | 10.000000 | 2016.000000 | 1.000000 | 0.000000 | 68.000000 | 1.000000 | 0.000000 |
| max | 11.000000 | 2001.000000 | 12.500000 | 2018.000000 | 1.000000 | 1.000000 | 91.000000 | 6.000000 | 1.000000 |
In [19]:
# generate profiling report
from ydata_profiling import ProfileReport
df.profile_report()
Upgrade to ydata-sdk
Improve your data and profiling with ydata-sdk, featuring data quality scoring, redundancy detection, outlier identification, text validation, and synthetic data generation.
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
100%|██████████| 19/19 [00:00<00:00, 27.90it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Out[19]:
In [20]:
#correlation analysis
df.corr(numeric_only=True)
Out[20]:
| Trainings_Attended | Year_of_birth | Last_performance_score | Year_of_recruitment | Targets_met | Previous_Award | Training_score_average | No_of_previous_employers | Promoted_or_Not | |
|---|---|---|---|---|---|---|---|---|---|
| Trainings_Attended | 1.000000 | 0.078710 | -0.062042 | 0.056215 | -0.044789 | -0.007409 | 0.041065 | 0.000796 | -0.024345 |
| Year_of_birth | 0.078710 | 1.000000 | -0.175572 | 0.654666 | 0.025337 | 0.013627 | 0.048390 | -0.003117 | 0.017991 |
| Last_performance_score | -0.062042 | -0.175572 | 1.000000 | -0.190333 | 0.276350 | 0.026587 | 0.057836 | -0.005428 | 0.119690 |
| Year_of_recruitment | 0.056215 | 0.654666 | -0.190333 | 1.000000 | 0.076910 | 0.041995 | 0.037477 | -0.003550 | 0.012287 |
| Targets_met | -0.044789 | 0.025337 | 0.276350 | 0.076910 | 1.000000 | 0.092934 | 0.077201 | -0.003308 | 0.224518 |
| Previous_Award | -0.007409 | 0.013627 | 0.026587 | 0.041995 | 0.092934 | 1.000000 | 0.072360 | 0.003887 | 0.201434 |
| Training_score_average | 0.041065 | 0.048390 | 0.057836 | 0.037477 | 0.077201 | 0.072360 | 1.000000 | 0.008194 | 0.178448 |
| No_of_previous_employers | 0.000796 | -0.003117 | -0.005428 | -0.003550 | -0.003308 | 0.003887 | 0.008194 | 1.000000 | 0.001690 |
| Promoted_or_Not | -0.024345 | 0.017991 | 0.119690 | 0.012287 | 0.224518 | 0.201434 | 0.178448 | 0.001690 | 1.000000 |
In [21]:
sns.countplot(x='Promoted_or_Not', data=df, hue='Promoted_or_Not', palette='Set1')
plt.title('promotion distribution')
plt.show()
C:\Users\HP\AppData\Local\Temp\ipykernel_8608\2511900159.py:3: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown plt.show()
In [22]:
plt.boxplot(df['Training_score_average'], vert=False)
plt.ylabel('Variable')
plt.xlabel('Targets_met')
plt.title('Box Plot')
plt.show()
C:\Users\HP\AppData\Local\Temp\ipykernel_8608\3987866815.py:5: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown plt.show()
In [23]:
plt.boxplot(df['Last_performance_score'], vert=False)
plt.ylabel('Variable')
plt.xlabel('Last_performance_score')
plt.title('Box Plot')
plt.show()
C:\Users\HP\AppData\Local\Temp\ipykernel_8608\3335382509.py:5: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown plt.show()
In [24]:
plt.boxplot(df['Year_of_recruitment'], vert=False)
plt.ylabel('Variable')
plt.xlabel('Year_of_recruitment')
plt.title('corr Plot in search of outliers')
plt.show()
C:\Users\HP\AppData\Local\Temp\ipykernel_8608\3332865500.py:5: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown plt.show()
In [25]:
# feature engineering - creating new feature 'Age' from 'Year_of_birth' to drop redundant data
df["Age"] = 2025 - df["Year_of_birth"]
df.head()
Out[25]:
| EmployeeNo | Division | Qualification | Gender | Channel_of_Recruitment | Trainings_Attended | Year_of_birth | Last_performance_score | Year_of_recruitment | Targets_met | Previous_Award | Training_score_average | State_Of_Origin | Foreign_schooled | Marital_Status | Past_Disciplinary_Action | Previous_IntraDepartmental_Movement | No_of_previous_employers | Promoted_or_Not | Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | YAK/S/00001 | Commercial Sales and Marketing | MSc MBA and PhD | Female | Direct Internal process | 2 | 1986 | 12.5 | 2011 | 1 | 0 | 41 | ANAMBRA | No | Married | No | No | 0 | 0 | 39 |
| 1 | YAK/S/00002 | Customer Support and Field Operations | First Degree or HND | Male | Agency and others | 2 | 1991 | 12.5 | 2015 | 0 | 0 | 52 | ANAMBRA | Yes | Married | No | No | 0 | 0 | 34 |
| 2 | YAK/S/00003 | Commercial Sales and Marketing | First Degree or HND | Male | Direct Internal process | 2 | 1987 | 7.5 | 2012 | 0 | 0 | 42 | KATSINA | Yes | Married | No | No | 0 | 0 | 38 |
| 3 | YAK/S/00004 | Commercial Sales and Marketing | First Degree or HND | Male | Agency and others | 3 | 1982 | 2.5 | 2009 | 0 | 0 | 42 | NIGER | Yes | Single | No | No | 1 | 0 | 43 |
| 4 | YAK/S/00006 | Information and Strategy | First Degree or HND | Male | Direct Internal process | 3 | 1990 | 7.5 | 2012 | 0 | 0 | 77 | AKWA IBOM | Yes | Married | No | No | 1 | 0 | 35 |
In [ ]:
#separating features and target variable
X = df[['Division', 'Qualification', 'Gender', 'Channel_of_Recruitment','Age', 'Trainings_Attended', 'Last_performance_score', 'Year_of_recruitment', 'Targets_met', 'Previous_Award', 'Training_score_average', 'Foreign_schooled', 'Past_Disciplinary_Action', 'Previous_IntraDepartmental_Movement', 'No_of_previous_employers']]
Y = df['Promoted_or_Not']
In [27]:
# scaling numerical features
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
x1 = X.copy()
scaler = MinMaxScaler(feature_range=(0, 1))
num_col_ = [col for col in X.columns if X[col].dtype != 'object']
num_cols = x1.select_dtypes(include=['int64', 'float64']).columns
x1[num_col_] = scaler.fit_transform(x1[num_col_])
# encoding categorical features
cat_cols = x1.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in cat_cols:
x1[col] = le.fit_transform(x1[col])
x1.head()
Out[27]:
| Division | Qualification | Gender | Channel_of_Recruitment | Age | Trainings_Attended | Last_performance_score | Year_of_recruitment | Targets_met | Previous_Award | Training_score_average | Foreign_schooled | Past_Disciplinary_Action | Previous_IntraDepartmental_Movement | No_of_previous_employers | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0 | 1 | 0.294118 | 0.000000 | 1.0 | 0.805556 | 1.0 | 0.0 | 0.166667 | 0 | 0 | 0 | 0.000000 |
| 1 | 2 | 0 | 1 | 0 | 0.196078 | 0.000000 | 1.0 | 0.916667 | 0.0 | 0.0 | 0.350000 | 1 | 0 | 0 | 0.000000 |
| 2 | 1 | 0 | 1 | 1 | 0.274510 | 0.000000 | 0.6 | 0.833333 | 0.0 | 0.0 | 0.183333 | 1 | 0 | 0 | 0.000000 |
| 3 | 1 | 0 | 1 | 0 | 0.372549 | 0.111111 | 0.2 | 0.750000 | 0.0 | 0.0 | 0.183333 | 1 | 0 | 0 | 0.166667 |
| 4 | 4 | 0 | 1 | 1 | 0.215686 | 0.111111 | 0.6 | 0.833333 | 0.0 | 0.0 | 0.766667 | 1 | 0 | 0 | 0.166667 |
In [28]:
# standardizing numerical features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(x1)
print(X_standardized[:5])
[[-0.83158796 1.41479513 -1.53339317 0.99209997 0.02737841 -0.41625517 1.28229978 -0.5021114 1.35384256 -0.15395043 -1.07512768 -3.24810652 -0.06290405 -0.32218928 -0.84238477] [-0.41929682 -0.61751078 0.65214846 -0.86183301 -0.62656278 -0.41625517 1.28229978 0.43654831 -0.73863832 -0.15395043 -0.25193251 0.30787168 -0.06290405 -0.32218928 -0.84238477] [-0.83158796 -0.61751078 0.65214846 0.99209997 -0.10340983 -0.41625517 -0.05313941 -0.26744648 -0.73863832 -0.15395043 -1.00029176 0.30787168 -0.06290405 -0.32218928 -0.84238477] [-0.83158796 -0.61751078 0.65214846 -0.86183301 0.55053137 1.22460994 -1.3885786 -0.97144126 -0.73863832 -0.15395043 -1.00029176 0.30787168 -0.06290405 -0.32218928 -0.03314114] [ 0.40528546 -0.61751078 0.65214846 0.99209997 -0.49577454 1.22460994 -0.05313941 -0.26744648 -0.73863832 -0.15395043 1.61896561 0.30787168 -0.06290405 -0.32218928 -0.03314114]]
In [29]:
# preparing for train-test split
categorical_features_X = X.select_dtypes(include=['object']).columns.tolist()
numerical_features_X = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
preprocessor = ColumnTransformer(
transformers=[
("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features_X),
("num", StandardScaler(), numerical_features_X)
]
)
In [30]:
# train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
In [31]:
# building random forest model
rf_model = Pipeline(steps=[
('preprocess', preprocessor),
('model', RandomForestClassifier(
n_estimators=300,
max_depth=None,
random_state=42,
class_weight='balanced' # handle imbalanced classes
))
])
# Fit
rf_model.fit(X_train, Y_train)
Out[31]:
Pipeline(steps=[('preprocess',
ColumnTransformer(transformers=[('cat',
OneHotEncoder(handle_unknown='ignore'),
['Division', 'Qualification',
'Gender',
'Channel_of_Recruitment',
'Foreign_schooled',
'Past_Disciplinary_Action',
'Previous_IntraDepartmental_Movement']),
('num', StandardScaler(),
['Age', 'Trainings_Attended',
'Last_performance_score',
'Year_of_recruitment',
'Targets_met',
'Previous_Award',
'Training_score_average',
'No_of_previous_employers'])])),
('model',
RandomForestClassifier(class_weight='balanced',
n_estimators=300, random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocess',
ColumnTransformer(transformers=[('cat',
OneHotEncoder(handle_unknown='ignore'),
['Division', 'Qualification',
'Gender',
'Channel_of_Recruitment',
'Foreign_schooled',
'Past_Disciplinary_Action',
'Previous_IntraDepartmental_Movement']),
('num', StandardScaler(),
['Age', 'Trainings_Attended',
'Last_performance_score',
'Year_of_recruitment',
'Targets_met',
'Previous_Award',
'Training_score_average',
'No_of_previous_employers'])])),
('model',
RandomForestClassifier(class_weight='balanced',
n_estimators=300, random_state=42))])ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
['Division', 'Qualification', 'Gender',
'Channel_of_Recruitment', 'Foreign_schooled',
'Past_Disciplinary_Action',
'Previous_IntraDepartmental_Movement']),
('num', StandardScaler(),
['Age', 'Trainings_Attended',
'Last_performance_score',
'Year_of_recruitment', 'Targets_met',
'Previous_Award', 'Training_score_average',
'No_of_previous_employers'])])['Division', 'Qualification', 'Gender', 'Channel_of_Recruitment', 'Foreign_schooled', 'Past_Disciplinary_Action', 'Previous_IntraDepartmental_Movement']
OneHotEncoder(handle_unknown='ignore')
['Age', 'Trainings_Attended', 'Last_performance_score', 'Year_of_recruitment', 'Targets_met', 'Previous_Award', 'Training_score_average', 'No_of_previous_employers']
StandardScaler()
RandomForestClassifier(class_weight='balanced', n_estimators=300,
random_state=42)In [32]:
# accuracy and classification report
rf_model.fit(X_train, Y_train)
rf_preds = rf_model.predict(X_test)
print("\n=== RANDOM FOREST RESULTS ===")
print("Accuracy:", accuracy_score(Y_test, rf_preds))
print(classification_report(Y_test, rf_preds))
=== RANDOM FOREST RESULTS ===
Accuracy: 0.9323449571935686
precision recall f1-score support
0 0.93 1.00 0.96 8768
1 0.84 0.25 0.38 810
accuracy 0.93 9578
macro avg 0.89 0.62 0.67 9578
weighted avg 0.93 0.93 0.91 9578
In [1]:
import xgboost
from xgboost import XGBClassifier
In [4]:
xgb_model = Pipeline(steps=[
("preprocess", preprocessor),
("model", XGBClassifier(
n_estimators=350,
learning_rate=0.1,
max_depth=6,
subsample=0.9,
colsample_bytree=0.8,
random_state=42,
eval_metric="logloss"
))
])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[4], line 1 ----> 1 xgb_model = Pipeline(steps=[ 2 ("preprocess", preprocessor), 3 ("model", XGBClassifier( 4 n_estimators=350, 5 learning_rate=0.1, 6 max_depth=6, 7 subsample=0.9, 8 colsample_bytree=0.8, 9 random_state=42, 10 eval_metric="logloss" 11 )) 12 ]) NameError: name 'Pipeline' is not defined
In [5]:
import sys
print(sys.executable)
c:\Users\HP\AppData\Local\Python\pythoncore-3.14-64\python.exe
In [9]:
import sys
print(sys.executable)
c:\Users\HP\AppData\Local\Python\pythoncore-3.14-64\python.exe
In [8]:
!py -m pip install xgboost
Requirement already satisfied: xgboost in c:\users\hp\appdata\local\python\pythoncore-3.14-64\lib\site-packages (3.1.2) Requirement already satisfied: numpy in c:\users\hp\appdata\local\python\pythoncore-3.14-64\lib\site-packages (from xgboost) (2.3.5) Requirement already satisfied: scipy in c:\users\hp\appdata\local\python\pythoncore-3.14-64\lib\site-packages (from xgboost) (1.16.3)